/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <algorithm>

#include "arm_gemm.hpp"

#include "../../asmlib.hpp"
#include "../../utils.hpp"

namespace arm_gemm {

void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
    const int K_stride = K;
    const long loops_count = ((K + 4) / 8) - 1;
    K -= loops_count * 8;
    const long regs_count = (K / 4) - 1;
    K -= (regs_count + 1) * 4;
    const long blocks_count = K / 1;
    float nullbias[4];
    if (!append && !bias) {
        memset(nullbias, 0, (4 * sizeof(float)));
    }
    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
    const float * const minptr = &minval;
    const float * const maxptr = &maxval;

    switch(act.type)
    {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0.0f;
            break;
    }

    for (int y=0; y<M; y+=8) {
        const float * const a_ptr0_base = A + (y * lda);
        const unsigned long ldab = lda * sizeof(float);

        float *c_ptr0 = C + (y * ldc);

        for (int x0=0; x0<N; x0+=4ul) {
            const long width = std::min((unsigned long)N-x0, 4ul);
            long loops = loops_count;
            long regs = regs_count;
            long blocks = blocks_count;
            const float *a_ptr0 = a_ptr0_base;
            const float *b_ptr0 = B + (K_stride * x0);
            const bool use_result_buffer = (width < 4);
            float result_buffer[32];
            const unsigned long ldcb = (use_result_buffer ? 4 : ldc) * sizeof(float);
            float *c_ptr_real = c_ptr0;
            if (use_result_buffer && append) {
                for(int cy=0; cy<std::min(M-y, 8); cy++) {
                    for(unsigned int cx=0; cx<width; cx++) {
                        result_buffer[cy * 4 + cx] = c_ptr_real[cy * ldc + cx];
                    }
                }
            }
            if (use_result_buffer) {
                c_ptr0 = result_buffer;
            }
            const float *biasptr = bias ? bias+x0 : nullbias;

            switch(M-y) {
                case 1:
                    __asm __volatile (
                        "ldr q24, [%[biasptr]]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "b.ne 2b\n"
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "b 4f\n"
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "b.ne 6b\n"
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                    );
                    break;
                case 2:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "c_ptr1 .req X1\n"
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "ldr q9, [a_ptr1]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "b.ne 2b\n"
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "b 4f\n"
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "b.ne 6b\n"
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "str q25, [c_ptr1]\n"
                        ".unreq a_ptr1\n"
                        ".unreq c_ptr1\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                    );
                    break;
                case 3:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "c_ptr1 .req X2\n"
                        "c_ptr2 .req X3\n"
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v26.16b, v24.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "ldr q10, [a_ptr2]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "b.ne 2b\n"
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "b 4f\n"
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "b.ne 6b\n"
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmax v26.4s, v26.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "fmin v26.4s, v26.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "str q25, [c_ptr1]\n"
                        "str q26, [c_ptr2]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                    );
                    break;
                case 4:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "c_ptr1 .req X3\n"
                        "c_ptr2 .req X4\n"
                        "c_ptr3 .req X5\n"
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v26.16b, v24.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v27.16b, v24.16b\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "ldr q3, [a_ptr3]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "ldr q3, [a_ptr3, #-0x10]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "b.ne 2b\n"
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "b 4f\n"
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "b.ne 6b\n"
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmax v26.4s, v26.4s, v22.4s\n"
                        "fmax v27.4s, v27.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "fmin v26.4s, v26.4s, v23.4s\n"
                        "fmin v27.4s, v27.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "str q25, [c_ptr1]\n"
                        "str q26, [c_ptr2]\n"
                        "str q27, [c_ptr3]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                    );
                    break;
                case 5:
                    // Five-row variant of the kernel body: accumulates into v24-v28 (one
                    // 4-float vector per row) and stores 16 bytes to each of five C rows.
                    // Row 0 uses the %[a_ptr0]/%[c_ptr0] operands directly; rows 1-4 use
                    // scratch registers x0-x7 through the .req aliases below, which is why
                    // x0-x7 appear in the clobber list.
                    // Trip counts come from the %[loops], %[regs] and %[blocks] operands,
                    // which are initialised outside this chunk.
                    // NOTE(review): the %[width] and %[append] operands are declared but
                    // never referenced in this template.
                    __asm __volatile (
                        // Symbolic names for the per-row A and C pointers (rows 1-4).
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "a_ptr4 .req X3\n"
                        "c_ptr1 .req X4\n"
                        "c_ptr2 .req X5\n"
                        "c_ptr3 .req X6\n"
                        "c_ptr4 .req X7\n"
                        // Prologue: seed every accumulator with the same 4 floats read from
                        // biasptr, derive the row pointers by repeatedly adding %[lda]/%[ldc]
                        // (used as raw byte offsets), and preload the first 4 floats of each
                        // A row (v0-v4) plus three B vectors (v16-v18).
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v26.16b, v24.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v27.16b, v24.16b\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "mov v28.16b, v24.16b\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "ldr q3, [a_ptr3]\n"
                        "add a_ptr4, a_ptr3, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "ldr q4, [a_ptr4]\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add c_ptr4, c_ptr3, %[ldc]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        // b_ptr0 now points past the first four B vectors; the fourth one
                        // (v19) is loaded later from offset -0x10.
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        // Main loop (label 2), executed %[loops] times. Each iteration
                        // consumes 8 floats from every A row (0x20 bytes) and eight B
                        // vectors (0x80 bytes): first v0-v4 against v16-v19, then v8-v12
                        // against the next four B vectors. Loads for the following step are
                        // interleaved with the multiplies, so the instruction order matters.
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        // The flags set here are consumed by the b.ne at the loop bottom;
                        // nothing in between modifies them.
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        // Each v0-v4 register is reloaded (for the next iteration) right
                        // after its last use with lane 3.
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "ldr q3, [a_ptr3, #-0x10]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add a_ptr4, a_ptr4, #0x20\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "ldr q4, [a_ptr4, #-0x10]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        // NOTE(review): rows 0-3 are prefetched in this loop; there is no
                        // matching prfm for a_ptr4.
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "b.ne 2b\n"
                        // Label 1: main loop finished (or skipped). Fetch the fourth pending
                        // B vector and issue store-prefetches for all five C rows.
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "prfm PSTL1KEEP, [c_ptr4]\n"
                        // %[regs] == 0: only the already-loaded v0-v4 remain (label 3).
                        // Otherwise fall through and also load and process one more
                        // 4-float chunk per row (v8-v12) with four more B vectors.
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "b 4f\n"
                        // Label 3: short tail - multiply the preloaded v0-v4 by v16-v19
                        // without loading any further A or B data.
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        // Label 4/6: leftover loop, run %[blocks] times. Each pass takes a
                        // single float from every A row and one B vector (0x10 bytes).
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "ldr s4, [a_ptr4]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "add a_ptr4, a_ptr4, #0x4\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "b.ne 6b\n"
                        // Label 5: epilogue. Broadcast the floats behind minptr/maxptr to
                        // v22/v23, clamp every accumulator to that range (fmax then fmin),
                        // store one 16-byte vector per C row and advance c_ptr0 by 16 bytes.
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmax v26.4s, v26.4s, v22.4s\n"
                        "fmax v27.4s, v27.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "fmin v26.4s, v26.4s, v23.4s\n"
                        "fmin v27.4s, v27.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "fmax v28.4s, v28.4s, v22.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "str q25, [c_ptr1]\n"
                        "fmin v28.4s, v28.4s, v23.4s\n"
                        "str q26, [c_ptr2]\n"
                        "str q27, [c_ptr3]\n"
                        "str q28, [c_ptr4]\n"
                        // Drop the aliases so the next asm statement can redefine them.
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq a_ptr4\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        ".unreq c_ptr4\n"
                        // Operands: read-write outputs, then read-only inputs, then clobbers
                        // (all of v0-v31, the scratch registers x0-x7, flags and memory).
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                    );
                    break;
                case 6:
                    // Six-row variant of the kernel body: accumulates into v24-v29 (one
                    // 4-float vector per row) and stores 16 bytes to each of six C rows.
                    // Row 0 uses the %[a_ptr0]/%[c_ptr0] operands directly; rows 1-5 use
                    // scratch registers x0-x9 through the .req aliases below, which is why
                    // x0-x9 appear in the clobber list.
                    // Trip counts come from the %[loops], %[regs] and %[blocks] operands,
                    // which are initialised outside this chunk.
                    // NOTE(review): the %[width] and %[append] operands are declared but
                    // never referenced in this template.
                    __asm __volatile (
                        // Symbolic names for the per-row A and C pointers (rows 1-5).
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "a_ptr4 .req X3\n"
                        "a_ptr5 .req X4\n"
                        "c_ptr1 .req X5\n"
                        "c_ptr2 .req X6\n"
                        "c_ptr3 .req X7\n"
                        "c_ptr4 .req X8\n"
                        "c_ptr5 .req X9\n"
                        // Prologue: seed every accumulator with the same 4 floats read from
                        // biasptr, derive the row pointers by repeatedly adding %[lda]/%[ldc]
                        // (used as raw byte offsets), and preload the first 4 floats of each
                        // A row (v0-v5) plus three B vectors (v16-v18).
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v26.16b, v24.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v27.16b, v24.16b\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "mov v28.16b, v24.16b\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "mov v29.16b, v24.16b\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "ldr q3, [a_ptr3]\n"
                        "add a_ptr4, a_ptr3, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "ldr q4, [a_ptr4]\n"
                        "add a_ptr5, a_ptr4, %[lda]\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "ldr q5, [a_ptr5]\n"
                        "add c_ptr4, c_ptr3, %[ldc]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add c_ptr5, c_ptr4, %[ldc]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "add a_ptr5, a_ptr5, #0x10\n"
                        // b_ptr0 now points past the first four B vectors; the fourth one
                        // (v19) is loaded later from offset -0x10.
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        // Main loop (label 2), executed %[loops] times. Each iteration
                        // consumes 8 floats from every A row (0x20 bytes) and eight B
                        // vectors (0x80 bytes): first v0-v5 against v16-v19, then v8-v13
                        // against the next four B vectors. Loads for the following step are
                        // interleaved with the multiplies, so the instruction order matters.
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q13, [a_ptr5]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        // The flags set here are consumed by the b.ne at the loop bottom;
                        // nothing in between modifies them.
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "add a_ptr4, a_ptr4, #0x20\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "add a_ptr5, a_ptr5, #0x20\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        // Each v0-v5 register is reloaded (for the next iteration) after
                        // its last use with lane 3.
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "ldr q3, [a_ptr3, #-0x10]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "ldr q4, [a_ptr4, #-0x10]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "ldr q5, [a_ptr5, #-0x10]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        // NOTE(review): rows 0-3 are prefetched in this loop; there is no
                        // matching prfm for a_ptr4 or a_ptr5.
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v29.4s, v16.4s, v13.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v29.4s, v17.4s, v13.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v29.4s, v18.4s, v13.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "fmla v29.4s, v19.4s, v13.s[3]\n"
                        "b.ne 2b\n"
                        // Label 1: main loop finished (or skipped). Fetch the fourth pending
                        // B vector and issue store-prefetches for all six C rows.
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "prfm PSTL1KEEP, [c_ptr4]\n"
                        "prfm PSTL1KEEP, [c_ptr5]\n"
                        // %[regs] == 0: only the already-loaded v0-v5 remain (label 3).
                        // Otherwise fall through and also load and process one more
                        // 4-float chunk per row (v8-v13) with four more B vectors.
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "ldr q13, [a_ptr5]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr5, a_ptr5, #0x10\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v29.4s, v16.4s, v13.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v29.4s, v17.4s, v13.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v29.4s, v18.4s, v13.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "fmla v29.4s, v19.4s, v13.s[3]\n"
                        "b 4f\n"
                        // Label 3: short tail - multiply the preloaded v0-v5 by v16-v19
                        // without loading any further A or B data.
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        // Label 4/6: leftover loop, run %[blocks] times. Each pass takes a
                        // single float from every A row and one B vector (0x10 bytes).
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "ldr s4, [a_ptr4]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "add a_ptr4, a_ptr4, #0x4\n"
                        "ldr s5, [a_ptr5]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "add a_ptr5, a_ptr5, #0x4\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "b.ne 6b\n"
                        // Label 5: epilogue. Broadcast the floats behind minptr/maxptr to
                        // v22/v23, clamp every accumulator to that range (fmax then fmin),
                        // store one 16-byte vector per C row and advance c_ptr0 by 16 bytes.
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmax v26.4s, v26.4s, v22.4s\n"
                        "fmax v27.4s, v27.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "fmin v26.4s, v26.4s, v23.4s\n"
                        "fmin v27.4s, v27.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "fmax v28.4s, v28.4s, v22.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "fmax v29.4s, v29.4s, v22.4s\n"
                        "str q25, [c_ptr1]\n"
                        "fmin v28.4s, v28.4s, v23.4s\n"
                        "fmin v29.4s, v29.4s, v23.4s\n"
                        "str q26, [c_ptr2]\n"
                        "str q27, [c_ptr3]\n"
                        "str q28, [c_ptr4]\n"
                        "str q29, [c_ptr5]\n"
                        // Drop the aliases so the next asm statement can redefine them.
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq a_ptr4\n"
                        ".unreq a_ptr5\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        ".unreq c_ptr4\n"
                        ".unreq c_ptr5\n"
                        // Operands: read-write outputs, then read-only inputs, then clobbers
                        // (all of v0-v31, the scratch registers x0-x9, flags and memory).
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                    );
                    break;
                // Seven-row variant: accumulates and stores a tile of seven C rows, 16 bytes
                // (four fp32 lanes) per row, using one accumulator register per row (v24-v30).
                // NOTE(review): the switch selector is outside this excerpt; presumably it is
                // the number of rows handled in this pass - confirm against the enclosing loop.
                case 7:
                    __asm __volatile (
                        // Symbolic names for the scratch registers that hold the A and C
                        // pointers of rows 1-6; row 0 uses the %[a_ptr0] / %[c_ptr0] operands.
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "a_ptr4 .req X3\n"
                        "a_ptr5 .req X4\n"
                        "a_ptr6 .req X5\n"
                        "c_ptr1 .req X6\n"
                        "c_ptr2 .req X7\n"
                        "c_ptr3 .req X8\n"
                        "c_ptr4 .req X9\n"
                        "c_ptr5 .req X10\n"
                        "c_ptr6 .req X11\n"
                        // Prologue: load one 16-byte vector from %[biasptr] and copy it into all
                        // seven accumulators, derive each row pointer by repeatedly adding the
                        // %[lda] / %[ldc] operands (used here as byte offsets between rows), and
                        // preload the first four floats of every A row (v0-v6) together with
                        // three B vectors (v16-v18). Every A pointer is then stepped past the
                        // four floats just loaded.
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v26.16b, v24.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v27.16b, v24.16b\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "mov v28.16b, v24.16b\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "mov v29.16b, v24.16b\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "mov v30.16b, v24.16b\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "ldr q3, [a_ptr3]\n"
                        "add a_ptr4, a_ptr3, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "ldr q4, [a_ptr4]\n"
                        "add a_ptr5, a_ptr4, %[lda]\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "ldr q5, [a_ptr5]\n"
                        "add a_ptr6, a_ptr5, %[lda]\n"
                        "add c_ptr4, c_ptr3, %[ldc]\n"
                        "ldr q6, [a_ptr6]\n"
                        "add c_ptr5, c_ptr4, %[ldc]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add c_ptr6, c_ptr5, %[ldc]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "add a_ptr5, a_ptr5, #0x10\n"
                        "add a_ptr6, a_ptr6, #0x10\n"
                        // B is stepped past four vectors although only three were loaded; the
                        // fourth (v19) is fetched later from offset -0x10.
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        // Skip the main loop entirely when %[loops] is zero.
                        "cbz %[loops], 1f\n"
                        // Main loop: each iteration consumes 32 bytes (eight floats) of every A
                        // row and 0x80 bytes (eight vectors) of B. The first half multiplies by
                        // lanes 0-3 of v0-v6 while the next four floats per row are loaded into
                        // v8-v14; the second half multiplies by lanes 0-3 of v8-v14.
                        // NOTE(review): only A rows 0-3 are prefetched inside this loop; rows 4-6
                        // have no prfm here - confirm that this is intentional.
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "ldr q13, [a_ptr5]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q14, [a_ptr6]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        // The flags set by this subs are consumed by the b.ne at the bottom of
                        // the loop; no instruction in between modifies them.
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v30.4s, v17.4s, v6.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "add a_ptr4, a_ptr4, #0x20\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "add a_ptr5, a_ptr5, #0x20\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "add a_ptr6, a_ptr6, #0x20\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v30.4s, v18.4s, v6.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        // Each of v0-v6 is refilled immediately after its last use (the lane-3
                        // multiply) with the four floats that the next pass will consume.
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "ldr q3, [a_ptr3, #-0x10]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "ldr q4, [a_ptr4, #-0x10]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "ldr q5, [a_ptr5, #-0x10]\n"
                        "fmla v30.4s, v19.4s, v6.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        // Second half of the iteration: same pattern using the A data in v8-v14,
                        // while v16-v18 are reloaded for the following pass.
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "ldr q6, [a_ptr6, #-0x10]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v29.4s, v16.4s, v13.s[0]\n"
                        "fmla v30.4s, v16.4s, v14.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v29.4s, v17.4s, v13.s[1]\n"
                        "fmla v30.4s, v17.4s, v14.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v29.4s, v18.4s, v13.s[2]\n"
                        "fmla v30.4s, v18.4s, v14.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "fmla v29.4s, v19.4s, v13.s[3]\n"
                        "fmla v30.4s, v19.4s, v14.s[3]\n"
                        "b.ne 2b\n"
                        // Tail: fetch the fourth B vector that pairs with the A data still held
                        // in v0-v6, and issue store prefetches for the seven C rows.
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "prfm PSTL1KEEP, [c_ptr4]\n"
                        "prfm PSTL1KEEP, [c_ptr5]\n"
                        "prfm PSTL1KEEP, [c_ptr6]\n"
                        // %[regs] == 0: only the four floats per row already in v0-v6 remain
                        // (label 3). Otherwise four more floats per row are loaded into v8-v14
                        // and eight are processed before joining label 4.
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "ldr q13, [a_ptr5]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "ldr q14, [a_ptr6]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "fmla v30.4s, v17.4s, v6.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr5, a_ptr5, #0x10\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr6, a_ptr6, #0x10\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "fmla v30.4s, v18.4s, v6.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "fmla v30.4s, v19.4s, v6.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v29.4s, v16.4s, v13.s[0]\n"
                        "fmla v30.4s, v16.4s, v14.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v29.4s, v17.4s, v13.s[1]\n"
                        "fmla v30.4s, v17.4s, v14.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v29.4s, v18.4s, v13.s[2]\n"
                        "fmla v30.4s, v18.4s, v14.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "fmla v29.4s, v19.4s, v13.s[3]\n"
                        "fmla v30.4s, v19.4s, v14.s[3]\n"
                        "b 4f\n"
                        // Short tail: multiply-accumulate the four floats per row held in v0-v6
                        // against v16-v19 without loading anything further.
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "fmla v30.4s, v17.4s, v6.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "fmla v30.4s, v18.4s, v6.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "fmla v30.4s, v19.4s, v6.s[3]\n"
                        // Leftover loop, run %[blocks] times: each iteration loads one B vector
                        // and a single float from every A row, then does one multiply-accumulate
                        // per row.
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "ldr s4, [a_ptr4]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "add a_ptr4, a_ptr4, #0x4\n"
                        "ldr s5, [a_ptr5]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "add a_ptr5, a_ptr5, #0x4\n"
                        "ldr s6, [a_ptr6]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "add a_ptr6, a_ptr6, #0x4\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "b.ne 6b\n"
                        // Epilogue: broadcast *minptr into v22 and *maxptr into v23, clamp every
                        // accumulator with fmax (lower bound) then fmin (upper bound), and store
                        // one 16-byte vector to each C row. Only %[c_ptr0] is advanced (by 16
                        // bytes); c_ptr1-c_ptr6 live in scratch registers and are recomputed by
                        // the prologue on each execution of this asm statement.
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmax v26.4s, v26.4s, v22.4s\n"
                        "fmax v27.4s, v27.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "fmin v26.4s, v26.4s, v23.4s\n"
                        "fmin v27.4s, v27.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "fmax v28.4s, v28.4s, v22.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "fmax v29.4s, v29.4s, v22.4s\n"
                        "str q25, [c_ptr1]\n"
                        "fmax v30.4s, v30.4s, v22.4s\n"
                        "fmin v28.4s, v28.4s, v23.4s\n"
                        "fmin v29.4s, v29.4s, v23.4s\n"
                        "str q26, [c_ptr2]\n"
                        "fmin v30.4s, v30.4s, v23.4s\n"
                        "str q27, [c_ptr3]\n"
                        "str q28, [c_ptr4]\n"
                        "str q29, [c_ptr5]\n"
                        "str q30, [c_ptr6]\n"
                        // Release the aliases: the other cases bind the same names to different
                        // registers (e.g. c_ptr1 is X6 here but X7 in case 8).
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq a_ptr4\n"
                        ".unreq a_ptr5\n"
                        ".unreq a_ptr6\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        ".unreq c_ptr4\n"
                        ".unreq c_ptr5\n"
                        ".unreq c_ptr6\n"
                        // Read-write operands: the row-0 pointers, the B pointer and the three
                        // counters.
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        // Read-only operands. NOTE(review): %[width] and %[append] are supplied
                        // but never referenced by the asm text of this case.
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        // Clobbers: every vector register, x0-x11 (the registers behind the
                        // .req aliases above), the condition flags and memory.
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
                    );
                    break;
                default:
                case 8:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "a_ptr4 .req X3\n"
                        "a_ptr5 .req X4\n"
                        "a_ptr6 .req X5\n"
                        "a_ptr7 .req X6\n"
                        "c_ptr1 .req X7\n"
                        "c_ptr2 .req X8\n"
                        "c_ptr3 .req X9\n"
                        "c_ptr4 .req X10\n"
                        "c_ptr5 .req X11\n"
                        "c_ptr6 .req X12\n"
                        "c_ptr7 .req X13\n"
                        "ldr q24, [%[biasptr]]\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "mov v25.16b, v24.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v26.16b, v24.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v27.16b, v24.16b\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "mov v28.16b, v24.16b\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "mov v29.16b, v24.16b\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "mov v30.16b, v24.16b\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "mov v31.16b, v24.16b\n"
                        "ldr q3, [a_ptr3]\n"
                        "add a_ptr4, a_ptr3, %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "ldr q4, [a_ptr4]\n"
                        "add a_ptr5, a_ptr4, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "ldr q5, [a_ptr5]\n"
                        "add a_ptr6, a_ptr5, %[lda]\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "ldr q6, [a_ptr6]\n"
                        "add a_ptr7, a_ptr6, %[lda]\n"
                        "add c_ptr4, c_ptr3, %[ldc]\n"
                        "ldr q7, [a_ptr7]\n"
                        "add c_ptr5, c_ptr4, %[ldc]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add c_ptr6, c_ptr5, %[ldc]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add c_ptr7, c_ptr6, %[ldc]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "add a_ptr5, a_ptr5, #0x10\n"
                        "add a_ptr6, a_ptr6, #0x10\n"
                        "add a_ptr7, a_ptr7, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "cbz %[loops], 1f\n"
                        "2:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "ldr q13, [a_ptr5]\n"
                        "fmla v31.4s, v16.4s, v7.s[0]\n"
                        "ldr q14, [a_ptr6]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q15, [a_ptr7]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v30.4s, v17.4s, v6.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v31.4s, v17.4s, v7.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr4, a_ptr4, #0x20\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "add a_ptr5, a_ptr5, #0x20\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "add a_ptr6, a_ptr6, #0x20\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "add a_ptr7, a_ptr7, #0x20\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v30.4s, v18.4s, v6.s[2]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v31.4s, v18.4s, v7.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "ldr q0, [%[a_ptr0], #-0x10]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "ldr q1, [a_ptr1, #-0x10]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "ldr q2, [a_ptr2, #-0x10]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "ldr q3, [a_ptr3, #-0x10]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "ldr q4, [a_ptr4, #-0x10]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "ldr q5, [a_ptr5, #-0x10]\n"
                        "fmla v30.4s, v19.4s, v6.s[3]\n"
                        "ldr q6, [a_ptr6, #-0x10]\n"
                        "fmla v31.4s, v19.4s, v7.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "ldr q7, [a_ptr7, #-0x10]\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v29.4s, v16.4s, v13.s[0]\n"
                        "fmla v30.4s, v16.4s, v14.s[0]\n"
                        "fmla v31.4s, v16.4s, v15.s[0]\n"
                        "ldr q16, [%[b_ptr0], #0x40]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v29.4s, v17.4s, v13.s[1]\n"
                        "fmla v30.4s, v17.4s, v14.s[1]\n"
                        "fmla v31.4s, v17.4s, v15.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x50]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v29.4s, v18.4s, v13.s[2]\n"
                        "fmla v30.4s, v18.4s, v14.s[2]\n"
                        "fmla v31.4s, v18.4s, v15.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "fmla v29.4s, v19.4s, v13.s[3]\n"
                        "fmla v30.4s, v19.4s, v14.s[3]\n"
                        "fmla v31.4s, v19.4s, v15.s[3]\n"
                        "b.ne 2b\n"
                        "1:\n"
                        "ldr q19, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "prfm PSTL1KEEP, [c_ptr4]\n"
                        "prfm PSTL1KEEP, [c_ptr5]\n"
                        "prfm PSTL1KEEP, [c_ptr6]\n"
                        "prfm PSTL1KEEP, [c_ptr7]\n"
                        "cbz %[regs], 3f\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr q8, [%[a_ptr0]]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "ldr q9, [a_ptr1]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "ldr q10, [a_ptr2]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "ldr q11, [a_ptr3]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "ldr q12, [a_ptr4]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "ldr q13, [a_ptr5]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "ldr q14, [a_ptr6]\n"
                        "fmla v31.4s, v16.4s, v7.s[0]\n"
                        "ldr q15, [a_ptr7]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "add a_ptr4, a_ptr4, #0x10\n"
                        "fmla v30.4s, v17.4s, v6.s[1]\n"
                        "add a_ptr5, a_ptr5, #0x10\n"
                        "fmla v31.4s, v17.4s, v7.s[1]\n"
                        "ldr q17, [%[b_ptr0], #0x10]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "add a_ptr6, a_ptr6, #0x10\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "add a_ptr7, a_ptr7, #0x10\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "fmla v30.4s, v18.4s, v6.s[2]\n"
                        "fmla v31.4s, v18.4s, v7.s[2]\n"
                        "ldr q18, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "fmla v30.4s, v19.4s, v6.s[3]\n"
                        "fmla v31.4s, v19.4s, v7.s[3]\n"
                        "ldr q19, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v16.4s, v8.s[0]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v25.4s, v16.4s, v9.s[0]\n"
                        "fmla v26.4s, v16.4s, v10.s[0]\n"
                        "fmla v27.4s, v16.4s, v11.s[0]\n"
                        "fmla v28.4s, v16.4s, v12.s[0]\n"
                        "fmla v29.4s, v16.4s, v13.s[0]\n"
                        "fmla v30.4s, v16.4s, v14.s[0]\n"
                        "fmla v31.4s, v16.4s, v15.s[0]\n"
                        "fmla v24.4s, v17.4s, v8.s[1]\n"
                        "fmla v25.4s, v17.4s, v9.s[1]\n"
                        "fmla v26.4s, v17.4s, v10.s[1]\n"
                        "fmla v27.4s, v17.4s, v11.s[1]\n"
                        "fmla v28.4s, v17.4s, v12.s[1]\n"
                        "fmla v29.4s, v17.4s, v13.s[1]\n"
                        "fmla v30.4s, v17.4s, v14.s[1]\n"
                        "fmla v31.4s, v17.4s, v15.s[1]\n"
                        "fmla v24.4s, v18.4s, v8.s[2]\n"
                        "fmla v25.4s, v18.4s, v9.s[2]\n"
                        "fmla v26.4s, v18.4s, v10.s[2]\n"
                        "fmla v27.4s, v18.4s, v11.s[2]\n"
                        "fmla v28.4s, v18.4s, v12.s[2]\n"
                        "fmla v29.4s, v18.4s, v13.s[2]\n"
                        "fmla v30.4s, v18.4s, v14.s[2]\n"
                        "fmla v31.4s, v18.4s, v15.s[2]\n"
                        "fmla v24.4s, v19.4s, v8.s[3]\n"
                        "fmla v25.4s, v19.4s, v9.s[3]\n"
                        "fmla v26.4s, v19.4s, v10.s[3]\n"
                        "fmla v27.4s, v19.4s, v11.s[3]\n"
                        "fmla v28.4s, v19.4s, v12.s[3]\n"
                        "fmla v29.4s, v19.4s, v13.s[3]\n"
                        "fmla v30.4s, v19.4s, v14.s[3]\n"
                        "fmla v31.4s, v19.4s, v15.s[3]\n"
                        "b 4f\n"
                        "3:\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "fmla v31.4s, v16.4s, v7.s[0]\n"
                        "fmla v24.4s, v17.4s, v0.s[1]\n"
                        "fmla v25.4s, v17.4s, v1.s[1]\n"
                        "fmla v26.4s, v17.4s, v2.s[1]\n"
                        "fmla v27.4s, v17.4s, v3.s[1]\n"
                        "fmla v28.4s, v17.4s, v4.s[1]\n"
                        "fmla v29.4s, v17.4s, v5.s[1]\n"
                        "fmla v30.4s, v17.4s, v6.s[1]\n"
                        "fmla v31.4s, v17.4s, v7.s[1]\n"
                        "fmla v24.4s, v18.4s, v0.s[2]\n"
                        "fmla v25.4s, v18.4s, v1.s[2]\n"
                        "fmla v26.4s, v18.4s, v2.s[2]\n"
                        "fmla v27.4s, v18.4s, v3.s[2]\n"
                        "fmla v28.4s, v18.4s, v4.s[2]\n"
                        "fmla v29.4s, v18.4s, v5.s[2]\n"
                        "fmla v30.4s, v18.4s, v6.s[2]\n"
                        "fmla v31.4s, v18.4s, v7.s[2]\n"
                        "fmla v24.4s, v19.4s, v0.s[3]\n"
                        "fmla v25.4s, v19.4s, v1.s[3]\n"
                        "fmla v26.4s, v19.4s, v2.s[3]\n"
                        "fmla v27.4s, v19.4s, v3.s[3]\n"
                        "fmla v28.4s, v19.4s, v4.s[3]\n"
                        "fmla v29.4s, v19.4s, v5.s[3]\n"
                        "fmla v30.4s, v19.4s, v6.s[3]\n"
                        "fmla v31.4s, v19.4s, v7.s[3]\n"
                        "4:\n"
                        "cbz %[blocks], 5f\n"
                        "6:\n"
                        "ldr q16, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x10\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr s1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v24.4s, v16.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v25.4s, v16.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v26.4s, v16.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "ldr s4, [a_ptr4]\n"
                        "fmla v27.4s, v16.4s, v3.s[0]\n"
                        "add a_ptr4, a_ptr4, #0x4\n"
                        "ldr s5, [a_ptr5]\n"
                        "fmla v28.4s, v16.4s, v4.s[0]\n"
                        "add a_ptr5, a_ptr5, #0x4\n"
                        "ldr s6, [a_ptr6]\n"
                        "fmla v29.4s, v16.4s, v5.s[0]\n"
                        "add a_ptr6, a_ptr6, #0x4\n"
                        "ldr s7, [a_ptr7]\n"
                        "fmla v30.4s, v16.4s, v6.s[0]\n"
                        "add a_ptr7, a_ptr7, #0x4\n"
                        "fmla v31.4s, v16.4s, v7.s[0]\n"
                        "b.ne 6b\n"
                        "5:\n"
                        "ld1r {v22.4s}, [%[minptr]]\n"
                        "ld1r {v23.4s}, [%[maxptr]]\n"
                        "fmax v24.4s, v24.4s, v22.4s\n"
                        "fmax v25.4s, v25.4s, v22.4s\n"
                        "fmax v26.4s, v26.4s, v22.4s\n"
                        "fmax v27.4s, v27.4s, v22.4s\n"
                        "fmin v24.4s, v24.4s, v23.4s\n"
                        "fmin v25.4s, v25.4s, v23.4s\n"
                        "fmin v26.4s, v26.4s, v23.4s\n"
                        "fmin v27.4s, v27.4s, v23.4s\n"
                        "str q24, [%[c_ptr0]]\n"
                        "fmax v28.4s, v28.4s, v22.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x10\n"
                        "fmax v29.4s, v29.4s, v22.4s\n"
                        "str q25, [c_ptr1]\n"
                        "fmax v30.4s, v30.4s, v22.4s\n"
                        "fmin v28.4s, v28.4s, v23.4s\n"
                        "fmax v31.4s, v31.4s, v22.4s\n"
                        "str q26, [c_ptr2]\n"
                        "fmin v29.4s, v29.4s, v23.4s\n"
                        "fmin v30.4s, v30.4s, v23.4s\n"
                        "fmin v31.4s, v31.4s, v23.4s\n"
                        "str q27, [c_ptr3]\n"
                        "str q28, [c_ptr4]\n"
                        "str q29, [c_ptr5]\n"
                        "str q30, [c_ptr6]\n"
                        "str q31, [c_ptr7]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq a_ptr4\n"
                        ".unreq a_ptr5\n"
                        ".unreq a_ptr6\n"
                        ".unreq a_ptr7\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        ".unreq c_ptr4\n"
                        ".unreq c_ptr5\n"
                        ".unreq c_ptr6\n"
                        ".unreq c_ptr7\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
                    );
                    break;
            }
            if (use_result_buffer) {
                for(int cy=0; cy<std::min(M-y, 8); cy++) {
                    for(unsigned int cx=0; cx<width; cx++) {
                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 4 + cx];
                    }
                }
            }
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__
